# Numeric Univariate Analysis 


# Importing the Data

```r
#install.packages(c("FactoMineR", "factoextra"))
library("FactoMineR") 
library("factoextra")
# Load the decathlon2 example dataset (presumably shipped with one of the
# packages attached above - confirm) and preview its first rows.
data(decathlon2)
head(decathlon2)
##           X100m Long.jump Shot.put High.jump X400m X110m.hurdle Discus
## SEBRLE    11.04      7.58    14.83      2.07 49.81        14.69  43.75
## CLAY      10.76      7.40    14.26      1.86 49.37        14.05  50.72
## BERNARD   11.02      7.23    14.25      1.92 48.93        14.99  40.87
## YURKOV    11.34      7.09    15.19      2.10 50.42        15.31  46.26
## ZSIVOCZKY 11.13      7.30    13.48      2.01 48.62        14.17  45.67
## McMULLEN  10.83      7.31    13.76      2.13 49.91        14.38  44.41
##           Pole.vault Javeline X1500m Rank Points Competition
## SEBRLE          5.02    63.19  291.7    1   8217    Decastar
## CLAY            4.92    60.15  301.5    2   8122    Decastar
## BERNARD         5.32    62.77  280.1    4   8067    Decastar
## YURKOV          4.72    63.44  276.4    5   8036    Decastar
## ZSIVOCZKY       4.42    55.37  268.0    7   8004    Decastar
## McMULLEN        4.42    56.37  285.1    8   7995    Decastar

1 Inspecting the Data

– Number of rows, Columns
– Variables - type, Values

# glimpse() reports the row and column counts, then lists every column with
# its type and leading values.
library(tibble)
glimpse(decathlon2)
## Rows: 27
## Columns: 13
## $ X100m        <dbl> 11.04, 10.76, 11.02, 11.34, 11.13, 10.83, 11.64, 11.37, 1…
## $ Long.jump    <dbl> 7.58, 7.40, 7.23, 7.09, 7.30, 7.31, 6.81, 7.56, 6.97, 7.2…
## $ Shot.put     <dbl> 14.83, 14.26, 14.25, 15.19, 13.48, 13.76, 14.57, 14.41, 1…
## $ High.jump    <dbl> 2.07, 1.86, 1.92, 2.10, 2.01, 2.13, 1.95, 1.86, 1.95, 1.9…
## $ X400m        <dbl> 49.81, 49.37, 48.93, 50.42, 48.62, 49.91, 50.14, 51.10, 4…
## $ X110m.hurdle <dbl> 14.69, 14.05, 14.99, 15.31, 14.17, 14.38, 14.93, 15.06, 1…
## $ Discus       <dbl> 43.75, 50.72, 40.87, 46.26, 45.67, 44.41, 47.60, 44.99, 4…
## $ Pole.vault   <dbl> 5.02, 4.92, 5.32, 4.72, 4.42, 4.42, 4.92, 4.82, 4.72, 4.6…
## $ Javeline     <dbl> 63.19, 60.15, 62.77, 63.44, 55.37, 56.37, 52.33, 57.19, 5…
## $ X1500m       <dbl> 291.70, 301.50, 280.10, 276.40, 268.00, 285.10, 262.10, 2…
## $ Rank         <int> 1, 2, 4, 5, 7, 8, 9, 10, 11, 12, 13, 1, 2, 3, 4, 5, 6, 7,…
## $ Points       <int> 8217, 8122, 8067, 8036, 8004, 7995, 7802, 7733, 7708, 765…
## $ Competition  <fct> Decastar, Decastar, Decastar, Decastar, Decastar, Decasta…

2 Random sample of the dataframe

# NOTE(review): on a data frame, sample() shuffles the columns; it does not
# draw rows. The output below still lists all 27 athletes, only with the
# columns in a random order. To pick n random rows instead, index with
# decathlon2[sample(nrow(decathlon2), n), ].
sample(decathlon2)
##             X110m.hurdle High.jump Javeline Shot.put Discus Points Long.jump
## SEBRLE             14.69      2.07    63.19    14.83  43.75   8217      7.58
## CLAY               14.05      1.86    60.15    14.26  50.72   8122      7.40
## BERNARD            14.99      1.92    62.77    14.25  40.87   8067      7.23
## YURKOV             15.31      2.10    63.44    15.19  46.26   8036      7.09
## ZSIVOCZKY          14.17      2.01    55.37    13.48  45.67   8004      7.30
## McMULLEN           14.38      2.13    56.37    13.76  44.41   7995      7.31
## MARTINEAU          14.93      1.95    52.33    14.57  47.60   7802      6.81
## HERNU              15.06      1.86    57.19    14.41  44.99   7733      7.56
## BARRAS             14.48      1.95    55.40    14.09  42.10   7708      6.97
## NOOL               15.29      1.98    57.44    12.68  37.92   7651      7.27
## BOURGUIGNON        15.67      1.86    54.68    13.46  40.49   7313      6.80
## Sebrle             14.05      2.12    70.52    16.36  48.72   8893      7.84
## Clay               14.13      2.06    69.71    15.23  50.11   8820      7.96
## Karpov             13.97      2.09    55.54    15.93  51.65   8725      7.81
## Macey              14.56      2.15    58.46    15.73  48.34   8414      7.47
## Warners            14.01      1.97    55.39    14.48  43.73   8343      7.74
## Zsivoczky          14.95      2.12    63.45    15.31  45.62   8287      7.14
## Hernu              14.25      2.03    57.76    14.65  44.72   8237      7.19
## Bernard            14.17      2.12    55.27    14.80  44.75   8225      7.48
## Schwarzl           14.25      1.94    56.32    14.01  42.43   8102      7.49
## Pogorelov          14.21      2.06    53.45    15.10  44.60   8084      7.31
## Schoenbeck         14.34      1.88    60.89    14.77  44.41   8077      7.30
## Barras             14.37      1.94    64.55    14.91  44.83   8067      6.99
## KARPOV             14.09      2.04    50.31    14.77  48.95   8099      7.30
## WARNERS            14.23      1.98    51.77    14.31  41.10   8030      7.60
## Nool               14.80      1.88    61.33    14.26  42.05   8235      7.53
## Drews              14.01      1.88    51.53    13.07  40.11   7926      7.38
##             Rank X400m Pole.vault Competition X100m X1500m
## SEBRLE         1 49.81       5.02    Decastar 11.04 291.70
## CLAY           2 49.37       4.92    Decastar 10.76 301.50
## BERNARD        4 48.93       5.32    Decastar 11.02 280.10
## YURKOV         5 50.42       4.72    Decastar 11.34 276.40
## ZSIVOCZKY      7 48.62       4.42    Decastar 11.13 268.00
## McMULLEN       8 49.91       4.42    Decastar 10.83 285.10
## MARTINEAU      9 50.14       4.92    Decastar 11.64 262.10
## HERNU         10 51.10       4.82    Decastar 11.37 285.10
## BARRAS        11 49.48       4.72    Decastar 11.33 282.00
## NOOL          12 49.20       4.62    Decastar 11.33 266.60
## BOURGUIGNON   13 51.16       5.02    Decastar 11.36 291.70
## Sebrle         1 48.36       5.00    OlympicG 10.85 280.01
## Clay           2 49.19       4.90    OlympicG 10.44 282.00
## Karpov         3 46.81       4.60    OlympicG 10.50 278.11
## Macey          4 48.97       4.40    OlympicG 10.89 265.42
## Warners        5 47.97       4.90    OlympicG 10.62 278.05
## Zsivoczky      6 49.40       4.70    OlympicG 10.91 269.54
## Hernu          7 48.73       4.80    OlympicG 10.97 264.35
## Bernard        9 49.13       4.40    OlympicG 10.69 276.31
## Schwarzl      10 49.76       5.10    OlympicG 10.98 273.56
## Pogorelov     11 50.79       5.00    OlympicG 10.95 287.63
## Schoenbeck    12 50.30       5.00    OlympicG 10.90 278.82
## Barras        13 49.41       4.60    OlympicG 11.14 267.09
## KARPOV         3 48.37       4.92    Decastar 11.02 300.20
## WARNERS        6 48.68       4.92    Decastar 11.11 278.10
## Nool           8 48.81       5.40    OlympicG 10.80 276.33
## Drews         19 48.51       5.00    OlympicG 10.87 274.21

3 Summary of all the variables of the dataframe

# Min, quartiles, median, mean and max for each numeric column, and the
# number of rows per level for the Competition factor.
summary(decathlon2)
##      X100m         Long.jump        Shot.put       High.jump    
##  Min.   :10.44   Min.   :6.800   Min.   :12.68   Min.   :1.860  
##  1st Qu.:10.84   1st Qu.:7.210   1st Qu.:14.17   1st Qu.:1.930  
##  Median :10.97   Median :7.310   Median :14.57   Median :1.980  
##  Mean   :10.99   Mean   :7.365   Mean   :14.54   Mean   :1.998  
##  3rd Qu.:11.13   3rd Qu.:7.545   3rd Qu.:15.01   3rd Qu.:2.080  
##  Max.   :11.64   Max.   :7.960   Max.   :16.36   Max.   :2.150  
##      X400m        X110m.hurdle       Discus        Pole.vault   
##  Min.   :46.81   Min.   :13.97   Min.   :37.92   Min.   :4.400  
##  1st Qu.:48.70   1st Qu.:14.15   1st Qu.:42.27   1st Qu.:4.660  
##  Median :49.20   Median :14.34   Median :44.72   Median :4.900  
##  Mean   :49.31   Mean   :14.50   Mean   :44.85   Mean   :4.836  
##  3rd Qu.:49.86   3rd Qu.:14.87   3rd Qu.:46.93   3rd Qu.:5.000  
##  Max.   :51.16   Max.   :15.67   Max.   :51.65   Max.   :5.400  
##     Javeline         X1500m           Rank            Points       Competition
##  Min.   :50.31   Min.   :262.1   Min.   : 1.000   Min.   :7313   Decastar:13  
##  1st Qu.:55.32   1st Qu.:271.6   1st Qu.: 4.000   1st Qu.:8000   OlympicG:14  
##  Median :57.19   Median :278.1   Median : 7.000   Median :8084                
##  Mean   :58.32   Mean   :278.5   Mean   : 7.444   Mean   :8119                
##  3rd Qu.:62.05   3rd Qu.:283.6   3rd Qu.:10.500   3rd Qu.:8236                
##  Max.   :70.52   Max.   :301.5   Max.   :19.000   Max.   :8893
#https://stackoverflow.com/questions/50848273/call-many-variables-in-a-for-loop-with-dplyr-ggplot-function
# Bar chart of the share of rows taken by each distinct value of a column.
#
# Args:
#   df: data frame holding the column.
#   x:  name of the column, given as a string.
#
# Returns:
#   A ggplot object with one bar per distinct value; rows where the column
#   is NA are dropped before the shares are computed.
plotUniCat <- function(df, x) {
  col_sym <- sym(x)
  present <- filter(df, !is.na(!!col_sym))
  shares <- mutate(count(present, !!col_sym), prop = prop.table(n))
  ggplot(shares, aes(y = prop, x = !!col_sym)) +
    geom_bar(stat = "identity")
}

4 Checking the column names of the dataframe

# Column names, in the order they are stored in the data frame.
colnames(decathlon2)
##  [1] "X100m"        "Long.jump"    "Shot.put"     "High.jump"    "X400m"       
##  [6] "X110m.hurdle" "Discus"       "Pole.vault"   "Javeline"     "X1500m"      
## [11] "Rank"         "Points"       "Competition"

5 Inspecting the structure of the dataframe

# Compact structural view: dimensions first, then each column's storage
# type and leading values.
str(decathlon2)
## 'data.frame':    27 obs. of  13 variables:
##  $ X100m       : num  11 10.8 11 11.3 11.1 ...
##  $ Long.jump   : num  7.58 7.4 7.23 7.09 7.3 7.31 6.81 7.56 6.97 7.27 ...
##  $ Shot.put    : num  14.8 14.3 14.2 15.2 13.5 ...
##  $ High.jump   : num  2.07 1.86 1.92 2.1 2.01 2.13 1.95 1.86 1.95 1.98 ...
##  $ X400m       : num  49.8 49.4 48.9 50.4 48.6 ...
##  $ X110m.hurdle: num  14.7 14.1 15 15.3 14.2 ...
##  $ Discus      : num  43.8 50.7 40.9 46.3 45.7 ...
##  $ Pole.vault  : num  5.02 4.92 5.32 4.72 4.42 4.42 4.92 4.82 4.72 4.62 ...
##  $ Javeline    : num  63.2 60.1 62.8 63.4 55.4 ...
##  $ X1500m      : num  292 302 280 276 268 ...
##  $ Rank        : int  1 2 4 5 7 8 9 10 11 12 ...
##  $ Points      : int  8217 8122 8067 8036 8004 7995 7802 7733 7708 7651 ...
##  $ Competition : Factor w/ 2 levels "Decastar","OlympicG": 1 1 1 1 1 1 1 1 1 1 ...

6 Readying the Data for univariate distributions plotting of numeric variables

# Keep only the numeric columns (this drops the Competition factor) so the
# plotting loops that follow can treat every column the same way.
# select(where(...)) is the current replacement for the superseded
# select_if() and returns the same columns.
library(dplyr)
data_num <- decathlon2 %>% select(where(is.numeric))
str(data_num)
## 'data.frame':    27 obs. of  12 variables:
##  $ X100m       : num  11 10.8 11 11.3 11.1 ...
##  $ Long.jump   : num  7.58 7.4 7.23 7.09 7.3 7.31 6.81 7.56 6.97 7.27 ...
##  $ Shot.put    : num  14.8 14.3 14.2 15.2 13.5 ...
##  $ High.jump   : num  2.07 1.86 1.92 2.1 2.01 2.13 1.95 1.86 1.95 1.98 ...
##  $ X400m       : num  49.8 49.4 48.9 50.4 48.6 ...
##  $ X110m.hurdle: num  14.7 14.1 15 15.3 14.2 ...
##  $ Discus      : num  43.8 50.7 40.9 46.3 45.7 ...
##  $ Pole.vault  : num  5.02 4.92 5.32 4.72 4.42 4.42 4.92 4.82 4.72 4.62 ...
##  $ Javeline    : num  63.2 60.1 62.8 63.4 55.4 ...
##  $ X1500m      : num  292 302 280 276 268 ...
##  $ Rank        : int  1 2 4 5 7 8 9 10 11 12 ...
##  $ Points      : int  8217 8122 8067 8036 8004 7995 7802 7733 7708 7651 ...
# Build one plotUniCat() chart per numeric column; the resulting plots are
# collected in the list `out`.
variables <- names(data_num)
out <- lapply(variables, plotUniCat, df = decathlon2)

7 Creating histograms for the columns in the dataframe

#https://stackoverflow.com/questions/17963962/plot-size-and-resolution-with-r-markdown-knitr-pandoc-beamer
# Lay the panels out on a 4 x 3 grid, then draw one histogram per numeric
# column, labelling the x-axis with the column name.
par(mfrow = c(4, 3))
for (i in names(data_num)) {
  hist(data_num[, i], xlab = i)
}

8 Creating histogram (frequency) for all the columns in the dataframe

# Grid of count histograms, each titled with its column name and drawn on
# the same 0-20 y-axis so the panels can be compared directly.
par(mfrow = c(4, 3))
for (i in names(data_num)) {
  hist(
    data_num[[i]],
    main = i,
    xlab = i,
    ylab = "frequency",
    freq = TRUE,
    ylim = c(0, 20)
  )
}

9 Creating histogram (frequency) for all the columns in the dataframe, without fixing the y-axis range

# Grid of count histograms titled with the column name; no ylim is set, so
# every panel picks its own y-axis range.
par(mfrow = c(4, 3))
for (i in names(data_num)) {
  hist(
    data_num[[i]],
    main = i,
    xlab = i,
    freq = TRUE
  )
}

10 Creating density plot for all the columns in the dataframe

# One kernel density curve per numeric column on a 4 x 3 grid, with the
# column name used as both the title and the x-axis label.
par(mfrow = c(4, 3))
for (i in names(data_num)) {
  column_density <- density(data_num[[i]])
  plot(column_density, main = i, xlab = i)
}

11 Bivariate Relationships and Correlation plots

library(psych)
# Pairwise overview of all numeric columns in a single panel matrix.
# NOTE(review): confirm in psych's documentation which plot elements
# `col = "red"` actually colours.
pairs.panels(data_num, col="red")

#methods(class = class(decathlon2[,'Competition']))
# List the methods registered for factors, the class of the Competition
# column.
methods(class = 'factor')
##  [1] [             [[            [[<-          [<-           all.equal    
##  [6] as.character  as.data.frame as.Date       as.list       as.logical   
## [11] as.POSIXlt    as.vector     c             coerce        droplevels   
## [16] format        initialize    is.na<-       length<-      levels<-     
## [21] Math          Ops           plot          print         recode       
## [26] relevel       relist        rep           scale_type    show         
## [31] slotsFromS3   summary       Summary       type_sum      xtfrm        
## see '?methods' for accessing help and source code
# The Competition factor: its labels, how many there are, and the number of
# rows recorded under each.
levels(decathlon2[["Competition"]])
## [1] "Decastar" "OlympicG"
nlevels(decathlon2[["Competition"]])
## [1] 2
summary(decathlon2[["Competition"]])
## Decastar OlympicG 
##       13       14

#Correlation Matrix with GGally

library(GGally)
# Check correlations (as scatterplots) and distributions, and print the correlation coefficients
ggpairs(data_num, title="correlogram with ggpairs()") 

library(GGally)
# Nice visualization of correlations
# NOTE(review): per the GGally docs, `method` takes two strings - how
# missing values are handled, then which correlation coefficient to use.
ggcorr(data_num, method = c("everything", "pearson"))  

# https://www.r-graph-gallery.com/199-correlation-matrix-with-ggally.html
# Quick display of two capabilities of GGally, to assess the distribution and correlation of variables 
library(GGally)
 
# From the help page:
data(flea)
head(flea)
##    species tars1 tars2 head aede1 aede2 aede3
## 1 Concinna   191   131   53   150    15   104
## 2 Concinna   185   134   50   147    13   105
## 3 Concinna   200   137   52   144    14   102
## 4 Concinna   173   127   50   144    16    97
## 5 Concinna   171   118   49   153    13   106
## 6 Concinna   160   118   47   140    15    99
# Pairwise plot matrix of columns 2 to 4 (tars1, tars2, head), coloured by species.
ggpairs(flea, columns = 2:4, ggplot2::aes(colour=species)) 

# Same idea on the decathlon data: the first twelve columns (the numeric
# ones), coloured by the Competition factor.
ggpairs(decathlon2, columns = 1:12, ggplot2::aes(colour=Competition))